R Sys.Date()A 10x genomics scRNA-Seq library was constructed from a mix of 293T and NIH3T3 cells. A mouse and a human cell barcode was selected for pulldown with a biotinylated DNA oligo with LNA bases added every third nucleotide.
Following reamplification the 2 cell libraries were pooled and resequenced together. The raw fastqs were then processed using a Snakemake pipeline, to produce two processed data files:
This RMarkdown document will produce the following figures:
First designate the libraries and the cells that were resampled.
cells <- list(
mouse_human_cell_pulldown = c("GACGTTAGTGCCTGTG",
"CTGATCCCATGACGGA"))
libs <- c(
"original_10x",
"mouse_human_cell_pulldown")
bc_metadat <- read_tsv(file.path(data_dir,
"lna_control",
"fastq",
"original",
"barcodes_from_10x_run.txt"),
col_names = c("cell_id", "barcode_10x"))
## original library to compare against
reflib <- "original_10x"
resampled_libs <- c("mouse_human_cell_pulldown")
## reference resampled lib for resampled vs control plots
resampled_lib <- "mouse_human_cell_pulldown"
## pretty name for libraries
lib_names = c(
original_10x = "Original Library",
mouse_human_cell_pulldown = "Resampled Library"
)
## pretty names for cells
cell_names = c(
"GACGTTAGTGCCTGTG-1" = "Mouse Cell",
"CTGATCCCATGACGGA-1" = "Human Cell")Load and organize a table for each library of read counts per cell per gene, and a table of umi counts per cell per gene.
umis_to_genes <- function(umipath, cells_to_exclude = c("Cell_unmatched")){
umis <- read_tsv(umipath,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != cells_to_exclude)
mol_fields <- str_count(umis$umi_molecule[1], "::")
if(mol_fields == 2 ){
umis <- separate(umis, umi_molecule,
into = c("seq", "genome", "gene"),
sep = "::") %>%
mutate(gene = str_c(genome, "::", gene))
} else if (mol_fields == 1){
umis <- separate(umis, umi_molecule,
into = c("seq", "gene"),
sep = "::")
} else {
stop("separator :: missing from umi_molecule field")
}
reads <- select(umis,
barcode_10x,
gene,
count)
reads <- group_by(reads,
barcode_10x, gene) %>%
summarize(counts = sum(count))
reads <- spread(reads, barcode_10x, counts,
fill = 0L)
reads
}
## read in umigroups flat file with read counts per umi per gene per cell
## expand out to a read count matrix
umipaths <- file.path(data_dir,
libs,
"umis",
"umigroups.txt.gz")
read_dat <- map(umipaths,
~umis_to_genes(.))
names(read_dat) <- libs
## read in umi_tools count table with umi counts per gene per cell
## drop rows with 0 counts
umi_dat <- map(libs,
~read_tsv(file.path(data_dir,
.x,
"dgematrix",
"dge_matrix.txt")) %>%
select(-matches("Cell_unmatched")) %>%
.[rowSums(.[, -1]) > 0, ])
names(umi_dat) <- libs
# add in cell info, including info for the original sample
cell_obj_mdata <- map(c(cells[1], cells),
~mutate(bc_metadat,
resampled = ifelse(barcode_10x %in% .x,
TRUE,
FALSE)))
names(cell_obj_mdata) <- libsNext organize these tables into simple classes called resampled-sets to keep track of each experiment’s relavant raw, processed, and meta data.
#' simple class to hold info for each experiment
create_sc_obj <- function(umi_df,
read_df,
cell_mdata_df){
x <- list()
class(x) <- "resampled-set"
x$umis <- umi_df
x$reads <- read_df
x$meta_dat <- cell_mdata_df
return(x)
}
sc_objs <- list(umi_dat, read_dat, cell_obj_mdata)
sc_objs <- pmap(sc_objs, create_sc_obj)
rm(umi_dat)
rm(read_dat)Next perform basic processing. 1) generate separate objects to store sparse matrices of umi and read counts. 2) normalize read and umi count data by total library size (sum of all read or umi counts for all cells in the experiment) and report as Reads per million or UMIs per million. 3) Compute per cell metrics (read and umi counts, sequencing saturation)
tidy_to_matrix <- function(df){
df <- as.data.frame(df)
rownames(df) <- df[, 1]
df[, 1] <- NULL
mat <- as.matrix(df)
mat <- as(mat, "sparseMatrix")
return(mat)
}
#' keep both tidy and matrix objs
generate_matrices <- function(sc_obj){
sc_obj$umi_matrix <- tidy_to_matrix(sc_obj$umis)
sc_obj$read_matrix <- tidy_to_matrix(sc_obj$reads)
sc_obj
}
#' normalize by library size (Reads per Million)
norm_libsize <- function(sc_obj){
sc_obj$norm_umi <- 1e6 * sweep(sc_obj$umi_matrix, 2,
sum(as.vector(sc_obj$umi_matrix)), "/")
sc_obj$norm_reads <- 1e6 * sweep(sc_obj$read_matrix, 2,
sum(as.vector(sc_obj$read_matrix)), "/")
sc_obj
}
add_metadata <- function(sc_obj, dat){
if (is.vector(dat)){
new_colname <- deparse(substitute(dat))
df <- data_frame(!!new_colname := dat)
df[[new_colname]] <- dat
df[["cell_id"]] <- names(dat)
sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
df,
by = "cell_id")
} else if (is.data.frame(dat)) {
sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
dat,
by = "cell_id")
}
sc_obj
}
compute_summaries <- function(sc_obj){
## raw counts
total_umis <- colSums(sc_obj$umi_matrix)
names(total_umis) <- colnames(sc_obj$umi_matrix)
total_reads <- colSums(sc_obj$read_matrix)
names(total_reads) <- colnames(sc_obj$read_matrix)
## norm counts
norm_total_umis <- colSums(sc_obj$norm_umi)
names(norm_total_umis) <- colnames(sc_obj$norm_umi)
norm_total_reads <- colSums(sc_obj$norm_reads)
names(norm_total_reads) <- colnames(sc_obj$norm_reads)
sc_obj <- add_metadata(sc_obj, total_umis)
sc_obj <- add_metadata(sc_obj, total_reads)
sc_obj <- add_metadata(sc_obj, norm_total_umis)
sc_obj <- add_metadata(sc_obj, norm_total_reads)
## compute cDNA duplication rate
sc_obj$meta_dat$cDNA_duplication <- 1 - (sc_obj$meta_dat$total_umis /
sc_obj$meta_dat$total_reads)
sc_obj
}
sc_objs <- map(sc_objs, generate_matrices)
sc_objs <- map(sc_objs, norm_libsize)
sc_objs <- map(sc_objs, compute_summaries)Compute enrichment of reads/umis over the original library.
sc_objs <- map(sc_objs,
function(sub_dat){
og_counts <- select(sc_objs[[reflib]]$meta_dat,
og_total_reads = total_reads,
og_total_umis = total_umis,
og_norm_total_umis = norm_total_umis,
og_norm_total_reads = norm_total_reads,
og_cDNA_duplication = cDNA_duplication,
cell_id)
sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
og_counts,
by = "cell_id")
sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
read_proportion = log2( total_reads / og_total_reads),
umi_proportion = log2( total_umis / og_total_umis),
norm_read_proportion = log2( norm_total_reads /
og_norm_total_reads),
norm_umi_proportion = log2( norm_total_umis /
og_norm_total_umis))
sub_dat
})sc_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library, levels = libs)) %>%
arrange(resampled)
plt <- ggplot(sc_metadat, aes(total_umis, cDNA_duplication)) +
geom_point(aes(color = resampled), size = 0.5) +
labs(x = expression(paste("# of UMIs")),
y = "Sequencing saturation") +
scale_x_log10(labels = scales::comma) +
scale_color_manual(values = color_palette) +
facet_wrap(~library,
labeller = labeller(library = lib_names)) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
plot.margin = unit(c(5.5, 20.5, 5.5, 5.5),
"points"))
pltNote the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.
save_plot("cDNA_duplication.pdf", plt,
base_aspect_ratio = 1.6)global_plot_theme <- theme(
legend.position = "top",
legend.text = element_text(size = 10),
strip.text = element_text(size = 8))
resampled_metadat <- filter(sc_metadat,
library != reflib) %>%
mutate(library = factor(library,
levels = resampled_libs))
unnorm_plt <- ggplot(resampled_metadat,
aes(og_total_reads / 3, total_reads / 3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
coord_fixed() +
xlab("original library\nreads count (Thousands)") +
ylab("resampled library\nreads count (Thousands)") +
# ggtitle("Raw reads associated with each cell barcode") +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = .5) +
global_plot_theme
norm_plt <- ggplot(resampled_metadat, aes(og_norm_total_reads / 1e3, norm_total_reads / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
xlab("original library\nRPM (Thousands)") +
ylab("resampled library\nRPM (Thousands)") +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = 0.5) +
theme(aspect.ratio = 1) +
global_plot_theme
unnorm_umi_plt <- ggplot(resampled_metadat,
aes(og_total_umis / 1e3, total_umis / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
coord_fixed() +
facet_wrap(~library, nrow = 1) +
xlab("original library\nUMI count (Thousands)") +
ylab("resampled library\nUMI count (Thousands)") +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = .5) +
global_plot_theme
norm_umi_plt <- ggplot(resampled_metadat,
aes(og_norm_total_umis / 1e3, norm_total_umis / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
xlab("original library\nUMI normalized RPM (Thousands)") +
ylab("resampled library\nUMIs per Million (Thousands)") +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = 0.5) +
theme(aspect.ratio = 1) +
global_plot_theme
plt <- plot_grid(unnorm_plt, norm_plt, unnorm_umi_plt, norm_umi_plt,
labels = "AUTO",
align = 'hv')
pltsave_plot("reads_per_barcode_scatterplots.pdf", plt, base_width = 8 )read <- ggplot(resampled_metadat,
aes(cell_id, norm_read_proportion, colour = resampled)) +
geom_point() +
labs(x = "Cell",
y = expression(paste( " Log"[2], " normalized reads ", frac("resampled", "original")))) +
scale_colour_manual(name = "resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12))
umi <- ggplot(resampled_metadat,
aes(cell_id, norm_umi_proportion, colour = resampled)) +
geom_point() +
labs(x = "Cell",
y = expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
scale_colour_manual(name = "resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12))
plt <- plot_grid(read, umi,
labels = "AUTO",
align = 'hv')
pltggsave("reads_umi_ratio_per_barcode_normalized.pdf", width = 8, height = 5)
umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(og_total_umis, norm_umi_proportion, colour = resampled)) +
geom_point(size = 0.5) +
geom_hline(aes(yintercept = 0),
linetype ="dashed",
color = "darkgrey") +
labs(x = "Abundance in original library\n (UMIs)",
y = expression(paste( "Log"[2], " UMIs ",
frac("resampled", "original")))) +
scale_x_continuous(labels = scales::comma) +
scale_colour_manual(name = "resampled:", values = color_palette) +
# facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))})
plt <- plot_grid(plotlist = umi_plots, nrow = 1)
save_plot("umi_ratio_MA.pdf", plt)ggplot(resampled_metadat, aes(resampled,
read_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " Reads ", frac("resampled", "original")))) +
scale_fill_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
umi_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " UMIs ", frac("resampled", "original")))) +
scale_fill_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
norm_read_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " normalized Reads ", frac("resampled", "original")))) +
scale_fill_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("norm_reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
norm_umi_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
scale_fill_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("norm_umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)dat <- group_by(resampled_metadat, library) %>%
filter(library != reflib) %>%
mutate(total_new = sum(total_reads, na.rm = T),
total_old = sum(og_total_reads, na.rm = T))
dat_group <- group_by(dat, library, resampled) %>%
summarize(total_new = sum(total_reads,
na.rm = T) / unique(total_new),
total_old = sum(og_total_reads,
na.rm = T) / unique(total_old)) %>%
gather(lib, percent_lib, -library, -resampled ) %>%
mutate(lib = factor(lib, levels = c("total_old", "total_new"),
labels = c("original\nlibrary", "resampled\nlibrary")))
ggplot(dat_group, aes(lib, percent_lib, fill = resampled)) +
geom_bar(stat = "identity") +
ylab("Fraction of\n Reads Assigned") +
scale_fill_manual(name = "resampled:",
values = color_palette) +
facet_wrap(~library) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
legend.position = "top",
legend.text = element_text(size = 16)
)ggsave("proportion_reads_all_barcode_barplot.pdf", width = 7, height = 7)
dat_group %>%
rename(Method = library) %>%
spread(lib, percent_lib) %>%
mutate(`Targeted Library Read Fold-Enrichment` = `resampled\nlibrary` / `original\nlibrary`) %>%
filter(resampled == T) %>%
select(Method, `Targeted Library Read Fold-Enrichment`)add_species_counts <- function(sc_obj,
mouse_gene_prefix = "mm38::",
human_gene_prefix = "hg38::"){
## get mouse and human reads
g_ids <- rownames(sc_obj$read_matrix)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
mouse_reads = colSums(sc_obj$read_matrix[mouse_ids, ])
human_reads = colSums(sc_obj$read_matrix[human_ids, ])
## get mouse and human UMIs
g_ids <- rownames(sc_obj$umi_matrix)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
mouse_umis = colSums(sc_obj$umi_matrix[mouse_ids, ])
human_umis = colSums(sc_obj$umi_matrix[human_ids, ])
## get norm counts for reads
g_ids <- rownames(sc_obj$norm_reads)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
norm_human_reads <- colSums(sc_obj$norm_reads[human_ids, ])
norm_mouse_reads <- colSums(sc_obj$norm_reads[mouse_ids, ])
## get norm counts for umis
g_ids <- rownames(sc_obj$norm_umi)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
norm_human_umis <- colSums(sc_obj$norm_umi[human_ids, ])
norm_mouse_umis <- colSums(sc_obj$norm_umi[mouse_ids, ])
sc_obj <- add_metadata(sc_obj, human_reads)
sc_obj <- add_metadata(sc_obj, mouse_reads)
sc_obj <- add_metadata(sc_obj, human_umis)
sc_obj <- add_metadata(sc_obj, mouse_umis)
sc_obj <- add_metadata(sc_obj, norm_human_reads)
sc_obj <- add_metadata(sc_obj, norm_mouse_reads)
sc_obj <- add_metadata(sc_obj, norm_human_umis)
sc_obj <- add_metadata(sc_obj, norm_mouse_umis)
## make sure mouse + human == total
stopifnot(all(sc_obj$meta_dat$total_reads == sc_obj$meta_dat$mouse_reads +
sc_obj$meta_dat$human_reads, na.rm = T))
stopifnot(all(sc_obj$meta_dat$total_umis == sc_obj$meta_dat$mouse_umis +
sc_obj$meta_dat$human_umis, na.rm = T))
## check floating point totals
tol <- 1e-5
reads_check <- all(abs(sc_obj$meta_dat$norm_total_reads - (sc_obj$meta_dat$norm_mouse_reads +
sc_obj$meta_dat$norm_human_reads)) <= tol, na.rm = T)
stopifnot(reads_check)
umis_check <- all(abs(sc_obj$meta_dat$norm_total_umis - (sc_obj$meta_dat$norm_mouse_umis +
sc_obj$meta_dat$norm_human_umis)) <= tol, na.rm = T)
stopifnot(umis_check)
## calculate species purity (human / human + mouse)
sc_obj$meta_dat <- mutate(sc_obj$meta_dat,
purity_reads = human_reads / (human_reads + mouse_reads),
purity_umis = human_umis / (human_umis + mouse_umis))
sc_obj
}
sc_objs <- map(sc_objs, add_species_counts)## add in metadat columns for original mouse and original human data
sc_objs <- map(sc_objs,
function(sub_dat){
og_dat <- select(sc_objs[[reflib]]$meta_dat,
cell_id,
str_subset(colnames(sc_objs[[reflib]]$meta_dat),
"human|mouse|purity"))
cols <- colnames(og_dat)
new_cols <- c("cell_id", str_c("og_", cols[2:length(cols)]))
colnames(og_dat) <- new_cols
sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
og_dat,
by = "cell_id")
sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
norm_human_read_proportion = log2( norm_human_reads /
og_norm_human_reads),
norm_human_umi_proportion = log2( norm_human_umis /
og_norm_human_umis),
norm_mouse_read_proportion = log2( norm_mouse_reads /
og_norm_mouse_reads),
norm_mouse_umi_proportion = log2( norm_mouse_umis /
og_norm_mouse_umis))
sub_dat
})
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = libs)) %>%
arrange(resampled)read_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(human_reads / 1e3, mouse_reads / 1e3,
colour = resampled)) +
geom_point(size = 0.5) +
facet_wrap(~library, nrow = 1,
scales = "free",
labeller = labeller(library = lib_names)) +
xlab("Human reads (Thousands)") +
ylab("Mouse reads (Thousands)") +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))
})
plt <- plot_grid(plotlist = read_plots,
nrow = 1)
save_plot("read_scatterplot_human_mouse.pdf",
plt, ncol = 3, nrow = 1,
base_width = 4, base_height = 4)
umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(human_umis,
mouse_umis,
colour = resampled)) +
geom_point(size = 0.5) +
facet_wrap(~library, nrow = 1,
scales = "free",
labeller = labeller(library = lib_names)) +
xlab("Human UMIs") +
ylab("Mouse UMIs") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
scale_colour_manual(name = "resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))
})
plt <- plot_grid(plotlist = umi_plots,
nrow = 1)
save_plot("umi_scatterplot_human_mouse.pdf",
plt, ncol = 2, nrow = 1,
base_width = 4, base_height = 4)## compute per gene or per gene/umi combo enrichment
detected_molecules <- function(sc_obj, molecule = "gene"){
umis <- sc_obj$umi_matrix
if (molecule == "gene"){
human_mat <- umis[str_detect(rownames(umis), "hg38::"), ]
mouse_mat <- umis[str_detect(rownames(umis), "mm38::"), ]
n_human_genes <- colSums(human_mat > 0)
n_mouse_genes <- colSums(mouse_mat > 0)
out_mdat <- data_frame(cell_id = colnames(umis),
n_human_genes = n_human_genes,
n_mouse_genes = n_mouse_genes)
sc_obj <- add_metadata(sc_obj, out_mdat)
}
}
sc_objs <- map(sc_objs, ~detected_molecules(.x))resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = libs))
og_genes <- filter(resampled_metadat,
library == reflib) %>%
dplyr::select(cell_id,
og_human_genes = n_human_genes,
og_mouse_genes = n_mouse_genes)
resampled_metadat <- left_join(resampled_metadat,
og_genes,
by = "cell_id")
ggplot(resampled_metadat, aes(n_human_genes,
og_human_genes, colour = resampled)) +
geom_point() +
ylab("resampled_genes") +
xlab("original_genes") +
scale_colour_manual(name = "resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
legend.position = "top",
legend.text = element_text(size = 16)
)ggplot(resampled_metadat, aes(n_mouse_genes,
og_mouse_genes, colour = resampled)) +
geom_point() +
ylab("resampled_genes") +
xlab("original_genes") +
scale_colour_manual(name = "resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
legend.position = "top",
legend.text = element_text(size = 16)
)calc_gene_sensitivity <- function(sc_obj,
type = "umi",
mouse_gene_prefix = "mm38::",
human_gene_prefix = "hg38::"){
if (type == "umi"){
count_matrix <- sc_obj$umi_matrix
} else {
count_matrix <- sc_obj$read_matrix
}
# generate list named with barcode of each detected gene and
# respective read/umi count
genes_detected <- apply(count_matrix, 2, function(x) x[x > 0])
sc_obj$genes_detected <- genes_detected
sc_obj
}
sc_objs <- map(sc_objs, calc_gene_sensitivity)sc_objs <- map(sc_objs,
function(x){
og_genes <- sc_objs[[reflib]]$genes_detected
sub_genes <- x$genes_detected
# subset list of cell barcodes to the same as the og experiment
# and also reorders the barcodes to match
sub_genes <- sub_genes[names(og_genes)]
if(length(sub_genes) != length(og_genes)){
stop("barcode lengths not the same")
}
shared_genes <- map2(sub_genes,
og_genes,
~intersect(names(.x),
names(.y)))
new_genes <- map2(sub_genes,
og_genes,
~setdiff(names(.x),
names(.y)))
not_recovered_genes <- map2(og_genes,
sub_genes,
~setdiff(names(.x),
names(.y)))
x$shared_genes <- shared_genes
x$new_genes <- new_genes
x$not_recovered_genes <- not_recovered_genes
return(x)
})
## add gene recovery info to meta data table
sc_objs <- map(sc_objs,
function(x){
shared_genes <- map2_dfr(x$shared_genes,
names(x$shared_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_shared_genes = mouse,
human_shared_genes = human,
shared_genes = mouse + human)
})
not_recovered_genes <- map2_dfr(x$not_recovered_genes,
names(x$not_recovered_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_not_recovered_genes = mouse,
human_not_recovered_genes = human,
not_recovered_genes = mouse + human)
})
new_genes <- map2_dfr(x$new_genes,
names(x$new_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_new_genes = mouse,
human_new_genes = human,
new_genes = mouse + human)
})
gene_mdata <- left_join(shared_genes,
not_recovered_genes,
by = "cell_id") %>%
left_join(., new_genes, by = "cell_id")
x <- add_metadata(x, gene_mdata)
x
})
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = libs)) %>%
arrange(resampled)genes_recovered <- resampled_metadat %>%
dplyr::filter(library != reflib) %>%
dplyr::select(cell_id,
library,
resampled,
shared_genes,
not_recovered_genes,
new_genes)
genes_recovered <- gather(genes_recovered,
key = type, value = count,
-cell_id, -resampled, -library)
genes_recovered <- mutate(genes_recovered,
type = str_replace_all(type, "_", "\n"))
plt <- ggplot(genes_recovered,
aes(cell_id, count)) +
geom_point(aes(color = resampled),
size = 0.6,
alpha = 0.75) +
facet_grid(type ~ library) +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.y = element_text(size = 12,
margin = margin(0,0.2,0,0.2, "cm"))) +
scale_color_manual(values = color_palette)
plt save_plot("new_genes_detected.pdf", plt, base_width = 8, base_height = 8)
targeted <- genes_recovered %>%
dplyr::filter(library == resampled_lib,
resampled)
targetedplt_dat <- genes_recovered %>%
dplyr::filter(library != resampled_lib,
resampled) %>%
group_by(type) %>%
summarize(count = mean(count)) %>%
mutate(cell_id = "Not Targeted Barcodes") %>%
bind_rows(targeted, .) %>%
filter(type == "new\ngenes")
plt <- ggplot(plt_dat,
aes(cell_id, count)) +
geom_bar(aes(fill = cell_id),
stat = "identity") +
labs(y = "Newly detected genes") +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "none") +
scale_fill_brewer(palette = "Set1")
save_plot("new_genes_barplot.pdf", plt,
base_width = 3.6, base_height = 5)
## species specific
genes_recovered <- resampled_metadat %>%
dplyr::filter(library %in% resampled_libs) %>%
select(cell_id, resampled, purity_reads, ends_with("genes")) %>%
mutate(species = ifelse(purity_reads > 0.80,
"human",
ifelse(purity_reads < 0.20,
"mouse",
"mixed"))) %>%
filter(species != "mixed") %>%
mutate(shared_genes_species = ifelse(species == "human",
human_shared_genes,
mouse_shared_genes),
new_genes_species = ifelse(species == "human",
human_new_genes,
mouse_new_genes),
not_recovered_genes_species = ifelse(species == "human",
human_not_recovered_genes,
mouse_not_recovered_genes)) %>%
select(cell_id, resampled, ends_with("_species")) %>%
gather(key = type, value = count,
-cell_id, -resampled) %>%
mutate(type = str_replace(type, "_species", "") %>%
str_replace_all(., "_", "\n"))
targeted <- genes_recovered %>%
dplyr::filter(resampled)
plt_dat <- genes_recovered %>%
dplyr::filter(!resampled) %>%
group_by(type) %>%
summarize(count = mean(count)) %>%
mutate(cell_id = "Not targeted\nbarcodes\n(mean)") %>%
bind_rows(targeted, .) %>%
mutate(type = ifelse(type == "new\ngenes",
"Newly\ndetected\ngenes",
ifelse(type == "shared\ngenes",
"Previously\ndetected\ngenes",
ifelse(type == "not\nrecovered\ngenes",
"Previously\ndetected\ngenes\nnot recovered",
NA))))
plt <- ggplot(plt_dat,
aes(cell_id, count)) +
geom_bar(aes(fill = type),
stat = "identity") +
labs(y = "# of genes") +
scale_x_discrete(labels = cell_names) +
scale_y_continuous(labels = scales::comma) +
scale_fill_brewer(palette = "Set1", name = "") +
guides(fill = guide_legend(override.aes = list(size = 0.25))) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "top",
plot.margin = unit(c(5.5, 50.5, 5.5, 5.5),
"points"))
# legend.key.size = unit(0.25, "pt"))
save_plot("new_genes_species_specific_barplot.pdf", plt,
base_width = 3.6, base_height = 5) calc_ma <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
x_rn <- rownames(xmat)
y_rn <- rownames(ymat)
xmat <- log2(xmat + 1)
ymat <- log2(ymat + 1)
rownames(xmat) <- x_rn
rownames(ymat) <- y_rn
m <- rowMeans(log2(((2^ymat + 2^xmat) / 2) + 1))
a <- xmat[, cell] - ymat[, cell]
data_frame(gene = names(a),
mean_expression_log2 = m,
log2_diff = a)
}
genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- str_c(cells[[resampled_lib]], "-1")
## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[, cols],
sc_objs[[reflib]]$umi_matrix[, cols])
ma_dat <- map(cell_ids,
~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols],
ref_mat[genes_to_plot, cols],
cell = .x))
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plot_ma <- function(df){
n_up <- filter(df, log2_diff > 0) %>%
group_by(cell) %>%
summarize(n = n(), n = paste0("up = ", n))
n_down <- filter(df, log2_diff < 0) %>%
group_by(cell) %>%
summarize(n = n(), n = paste0("down = ", n))
if (nrow(n_down) == 0) {
n_down = data_frame(cell = df$cell %>% unique(),
n = "down = 0")
}
plt <- ggplot(df,
aes(mean_expression_log2,
log2_diff)) +
geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") +
geom_point(size = 0.25) +
geom_text(data = n_up, aes(x = max(ma_dat$mean_expression_log2) * 0.9,
y = max(ma_dat$log2_diff) * 1.2,
label = n)) +
geom_text(data = n_down, aes(x = max(ma_dat$mean_expression_log2) * 0.9,
y = min(ma_dat$log2_diff) * 1.2,
label = n)) +
facet_wrap(~cell) +
labs(x = expression(paste("Abundance (log"[2], ")")),
y = expression(paste(frac("resampled","Original"), " (log"[2], ")")))
plt
}
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_all_genes.pdf", plt, base_height = 6)
## Shared genes
ma_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_shared_genes.pdf", plt, base_height = 6)
## New genes
ma_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_new_genes.pdf", plt, base_height = 6)get_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
xrows <- rownames(xmat)
xmat <- log2(xmat[, cell] + 1)
ymat <- log2(ymat[xrows, cell] + 1)
data_frame(
gene = xrows,
resampled = xmat,
original = ymat) %>%
gather(library,
Expression, -gene)
}
plot_histogram <- function(df){
ggplot(df,
aes_string("Expression")) +
geom_density(aes_string(fill = "library"),
alpha = 0.66) +
scale_fill_viridis_d(name = "") +
facet_wrap(~cell, nrow = 1) +
theme(legend.position = "top",
strip.text = element_text(size = 8))
}
expr_dat <- map(cell_ids,
function(x){
genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_histogram(expr_dat)
expressed_in_resampled_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_histogram(expr_dat)
share_genes_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- plot_histogram(expr_dat)
plts <- list(
expressed_in_resampled_plt,
share_genes_plt,
new_gene_plt
)
plt <- plot_grid(plotlist = plts, ncol = 1)
pltsave_plot("expression_histograms.pdf", plt,
ncol = 1, nrow = 3,
base_height = 4,
base_aspect_ratio = 2)get_paired_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
xrows <- rownames(xmat)
xmat <- log2(xmat[, cell] + 1)
ymat <- log2(ymat[xrows, cell] + 1)
data_frame(
gene = xrows,
resampled = xmat,
original = ymat)
}
plot_scatter <- function(df){
ggplot(df,
aes_string("original", "resampled")) +
geom_point(size = 0.5) +
geom_abline(aes(slope = 1, intercept = 0)) +
facet_wrap(~cell, nrow = 1) +
expand_limits(x = 0, y = 0) +
coord_fixed() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
theme(legend.position = "top",
strip.text = element_text(size = 8))
}
expr_dat <- map(cell_ids,
function(x){
genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_scatter(expr_dat)
expressed_in_resampled_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_scatter(expr_dat)
share_genes_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- ggplot(expr_dat, aes(cell,resampled)) +
geom_jitter(alpha = 0.55) +
geom_violin(aes(fill = cell)) +
ylim(0, max(expr_dat$resampled) * 1.10) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0.5),
legend.pos = "none",
axis.title.x = element_blank())
new_gene_pltplts <- list(
expressed_in_resampled_plt,
share_genes_plt
)
plt <- plot_grid(plotlist = plts, ncol = 1)
pltsave_plot("expression_scatterplots.pdf", plt,
ncol = 1, nrow = 3,
base_height = 4,
base_aspect_ratio = 2)
save_plot("expression_newgenes_violinplots.pdf", new_gene_plt,
base_height = 6,
base_aspect_ratio = 0.5)compare_umis <- function(path_to_ctrl,
path_to_test,
return_summary = F){
## umi seqs should be produced by ./get_molecule_info
ctrl_umi_seqs <- read_tsv(path_to_ctrl,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != "Cell_unmatched")
test_umi_seqs <- read_tsv(path_to_test,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != "Cell_unmatched")
umi_seqs <- full_join(ctrl_umi_seqs,
test_umi_seqs,
by = c("barcode_10x", "umi_molecule"))
if (return_summary) {
umi_seqs %>%
mutate(new_umi = ifelse(is.na(count.x) & !is.na(count.y),
1L,
0L),
not_detected_umi = ifelse(!is.na(count.x) & is.na(count.y),
1L,
0L),
shared_umi = ifelse(!is.na(count.x) & !is.na(count.y),
1L,
0L)) %>%
group_by(barcode_10x) %>%
summarize(new_umis = sum(new_umi),
not_detected_umis = sum(not_detected_umi),
shared_umis = sum(shared_umi))
} else {
umi_seqs
}
}
umi_files <- file.path(data_dir, libs, "umis", "umigroups.txt.gz")
umi_summaries <- map(umi_files[2],
~compare_umis(umi_files[1], .x, return_summary = T))
names(umi_summaries) <- umi_files[2] %>%
str_split(., "/") %>%
map_chr(~.x[7])
umi_summary <- bind_rows(umi_summaries, .id = "library")
umis_recovered <- umi_summary %>%
gather(class, count, -barcode_10x, -library)
## annotate with resampled or not
cell_annot <- data_frame(barcode_10x = c(cells[1], cells),
library = libs,
resampled = T) %>%
unnest()
umis_recovered <- umis_recovered %>%
mutate(resampled = ifelse(str_replace(barcode_10x, "-1", "") %in% unlist(cells),
T,
F)) %>%
arrange(resampled)
plt <- ggplot(umis_recovered,
aes(barcode_10x, count)) +
geom_point(aes(colour = resampled),
size = 0.6,
alpha = 0.75) +
facet_grid(library ~ class) +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.y = element_text(size = 12,
margin = margin(0,0.2,0,0.2, "cm"))) +
scale_color_manual(values = color_palette)
plt umi_seqs <- map(umi_files[2],
~compare_umis(umi_files[1], .x, return_summary = F))
names(umi_seqs) <- umi_files[2] %>%
str_split(., "/") %>%
map_chr(~.x[7])
new_umis <- map(umi_seqs,
~filter(.x,
str_replace(barcode_10x, "-1", "") %in% unlist(cells),
!is.na(count.y),
is.na(count.x)) %>%
separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>%
select(-starts_with("count")))
old_umis <- map(umi_seqs,
~filter(.x,
str_replace(barcode_10x, "-1", "") %in% unlist(cells),
!is.na(count.x),
!is.na(count.y)) %>%
separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>%
select(-starts_with("count")))
umi_edit_dist <- map2(new_umis,
old_umis,
~left_join(.x, .y,
by = c("barcode_10x", "genome", "gene")) %>%
na.omit() %>%
mutate(ed = kentr::get_hamming_pairs(seq.x, seq.y)) %>%
group_by(barcode_10x, seq.y, genome, gene) %>%
summarize(min_ed = min(ed)) %>%
ungroup())
umi_edit_dist <- bind_rows(umi_edit_dist,
.id = "library")
ggplot(umi_edit_dist, aes(barcode_10x,
min_ed)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, scales = "free_x") +
labs(y = "Minimum edit distance\noriginal vs. new UMIs") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5),
axis.title.x = element_blank())rm(umi_seqs)library(Seurat)
mat <- sc_objs[[reflib]]$umi_matrix
sobj <- CreateSeuratObject(mat)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = F, y.cutoff = 0.33)
sobj <- RunPCA(sobj, pc.genes = sobj@var.genes,
pcs.compute = 20,
do.print = F, seed.use = 20180521)
sobj <- RunTSNE(sobj, dims.use = 1:7, seed.use = 20180521)
sobj <- FindClusters(sobj,
dims.use = 1:7,
resolution = 0.6,
print.output = F,
random.seed = 20180521)plt <- TSNEPlot(sobj,
colors.use = c(brewer.pal(12, "Paired"),
brewer.pal(9, "Set1")),
do.label = T) +
labs(title = "NIH3T3 and 293T") +
theme(legend.position = "none")pltsave_plot("original_cells_tsne.pdf", plt,
base_height = 4.25, base_width = 4.25)
plts <- FeaturePlot(sobj, c("mm38::Malat1", "hg38::MALAT1"), do.return = T)plt <- plot_grid(plotlist = plts, nrow = 1)
save_plot("original_cells_mouse_human_markers.pdf", plt,
base_height = 4.25, base_width = 8.5)mat <- sc_objs[[reflib]]$umi_matrix
resampled_ids <- sc_objs[[resampled_lib]]$meta_dat %>%
filter(resampled) %>%
pull(cell_id)
resampled_mat <- sc_objs[[resampled_lib]]$umi_matrix[, resampled_ids]
colnames(resampled_mat) <- str_c(colnames(resampled_mat),
"::",
"resampled")
mat <- as.data.frame(as.matrix(mat)) %>%
rownames_to_column("gene")
resampled_mat <- as.data.frame(as.matrix(resampled_mat)) %>%
rownames_to_column("gene")
combined_mats <- left_join(mat, resampled_mat, by = c("gene"))
combined_mats <- as.data.frame(combined_mats) %>%
column_to_rownames("gene") %>%
as.matrix() %>%
as(., "sparseMatrix")
combined_mats[is.na(combined_mats)] <- 0
sobj <- CreateSeuratObject(combined_mats)
new_ids <- sobj@meta.data %>%
rownames_to_column("cell") %>%
mutate(resampled = ifelse(str_detect(cell, "resampled"),
"resampled",
"not resampled"))
resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", "cell"] %>%
str_replace("::resampled", "")
new_ids <- mutate(new_ids,
resampled = ifelse(cell %in% resampled_cell_ids,
"original cell",
resampled)) %>%
select(cell, resampled) %>%
as.data.frame(.) %>%
column_to_rownames("cell")
sobj <- AddMetaData(sobj, new_ids)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = T, y.cutoff = 1)sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data),
pcs.compute = 20,
do.print = F, seed.use = 20180522)
sobj <- RunTSNE(sobj, dims.use = 1:7, seed.use = 20180522)
sobj <- FindClusters(sobj,
dims.use = 1:7,
resolution = 0.6,
print.output = F,
random.seed = 20180522)mdata <- sc_objs$mouse_human_cell_pulldown$meta_dat %>%
select(cell_id, resampled, matches("purity_umis"))
mdata_sub <- filter(mdata, resampled) %>%
select(cell_id, resampled, proportion_human = purity_umis) %>%
mutate(cell_id = str_c(cell_id, "::resampled"),
proportion_mouse = 1 - proportion_human)
mdata_ogsub <- filter(mdata, resampled) %>%
select(cell_id, resampled, proportion_human = og_purity_umis) %>%
mutate(proportion_mouse = 1 - proportion_human)
mdata_not_sub <- filter(mdata, !resampled) %>%
select(cell_id, resampled, proportion_human = og_purity_umis) %>%
mutate(proportion_mouse = 1 - proportion_human)
mdata <- bind_rows(list(mdata_sub, mdata_ogsub, mdata_not_sub)) %>%
unique() %>%
mutate(species = ifelse(proportion_human > 0.90,
"Human",
ifelse(proportion_mouse > 0.90,
"Mouse",
"Doublet"))) %>%
as.data.frame() %>%
tibble::column_to_rownames("cell_id") %>%
select(-resampled)
sobj <- AddMetaData(sobj, mdata)
sobj <- SetAllIdent(sobj, "species")
tsne_dat <- GetCellEmbeddings(sobj, reduction.type = "tsne") %>%
as.data.frame() %>%
tibble::rownames_to_column("cell")
mdata <- sobj@meta.data %>%
as.data.frame() %>%
tibble::rownames_to_column("cell")
tsne_dat <- left_join(tsne_dat,
mdata,
by = "cell")
plt <- ggplot(tsne_dat,
aes(tSNE_1, tSNE_2)) +
geom_point(aes(color = species), size = 0.1) +
scale_color_manual(values = brewer.pal(3, "Paired"),
name = "") +
labs(title = "",
x = "tSNE 1",
y = "tSNE 2") +
theme_cowplot() +
guides(color = guide_legend(nrow = 3,
ncol = 1,
override.aes = list(size = 4))) +
theme(legend.position = c(1.05, 1.05),
legend.justification = c("right", "top"),
legend.box.just = "right")
save_plot("original_resampled_cells_mouse_human.pdf", plt,
base_height = 4.25, base_width = 8.5)
og_cell_dat <- tsne_dat %>%
filter(cell %in% str_c(cells[[resampled_lib]], "-1"))
resampled_cell_dat <- tsne_dat %>%
filter(cell %in% str_c(cells[[resampled_lib]],
"-1::resampled")) %>%
mutate(cell = str_replace(cell, "::resampled", "")) %>%
select(cell, tSNE_1, tSNE_2)
cell_dat <- left_join(og_cell_dat,
resampled_cell_dat,
by = "cell",
suffix = c("", "_resampled"))
resampled_cells <- ggplot(tsne_dat,
aes(tSNE_1, tSNE_2)) +
geom_point(aes(color = resampled), size = 0.1) +
scale_color_manual(values = c("lightgrey",
brewer.pal(7, "Set1")[1:2]),
name = "",
labels = c("not resampled" = "Not Resampled",
"original cell" = "Original Cell",
"resampled" = "Resampled Cell")) +
geom_segment(data = cell_dat,
aes(x = tSNE_1,
y = tSNE_2,
xend = tSNE_1_resampled,
yend = tSNE_2_resampled),
linejoin = "mitre",
arrow = arrow(length = unit(0.03, "npc"))) +
labs(title = "",
x = "tSNE 1",
y = "tSNE 2") +
theme_cowplot() +
guides(color = guide_legend(nrow = 3,
ncol = 1,
override.aes = list(size = 4))) +
theme(legend.position = c(1.05, 1.05),
legend.justification = c("right", "top"),
legend.box.just = "right")
plt <- plot_grid(plt,
resampled_cells,
ncol = 2)
pltsave_plot("resampled_tsne.pdf", plt,
nrow = 1, ncol = 2,
base_height = 5.5, base_width = 4.25)saveRDS(sobj, "rs_sobj.rds")Find the k-nearest neighbors in PCA space
## use combined data from above
data.use <- GetCellEmbeddings(object = sobj,
reduction.type = "pca",
dims.use = 1:7)
## findnearest neighboors using exact search
knn <- RANN::nn2(data.use, k = 5,
searchtype = 'standard',
eps = 0)
resampled_idxs <- knn$nn.idx[str_detect(rownames(data.use), "::resampled"), ]
nn_ids <- as_data_frame(t(apply(resampled_idxs, 1,
function(x)rownames(data.use)[x])))
colnames(nn_ids) <- c("query_cell",
paste0("nearest neighbor ", 1:(ncol(nn_ids) - 1)))
nn_ids